library(tidyverse)
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3
library(plotly) 
## Warning: package 'plotly' was built under R version 3.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
options(scipen = 4)

First, I download the nlsy79 data from the course website and import the dataset.

# Read the raw NLSY79 extract; readr parses every column as double here
# (survey codes included), which is why recoding is needed later.
nlsy <- read_csv("nlsy79_income.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
# In the variables description file, we can see that there are over 70 variables and names are long, so I change the reference numbers to the question name.
# NOTE(review): only the human-readable names (e.g. income, gender, race,
# expect_education, num_drug_1984, marriage_col_2000, familysize_2012,
# jobsnum_2012) are used downstream; the remaining raw question codes are
# kept verbatim. The name order must match the CSV column order exactly.
colnames(nlsy) <- c("versionid",
    "caseid",
    "birth_country",
    "FAM-POB_1979",
    "FAM-3_1979",
    "FAM-3A_1979",
    "FAM-RES_1979",
    "FAM-6_1979",
    "R_REL-1_COL_1979",
    "expect_education",
    "army",
    "WOMENS-ROLES_000001_1979",
    "WOMENS-ROLES_000002_1979",
    "WOMENS-ROLES_000003_1979",
    "WOMENS-ROLES_000004_1979",
    "WOMENS-ROLES_000006_1979",
    "WOMENS-ROLES_000007_1979",
    "WOMENS-ROLES_000008_1979",
    "EXP-OCC_1979",
    "EXP-9_1979",
    "race",
    "gender",
    "MARSTAT-KEY_1979",
    "FAMSIZE_1979",
    "povstatus_1979",
    "police_1980",
    "POLIC-1C_1980",
    "POLICE-2_1980",
    "ALCH-2_1983",
    "num_drug_1984",
    "DS-9_1984",
    "Q13-5_TRUNC_REVISED_1990",
    "POVSTATUS_1990",
    "HGCREV90_1990",
    "jobs.num",
    "NUMCH90_1990",
    "AGEYCH90_1990",
    "DS-12_1998",
    "DS-13_1998",
    "INDALL-EMP.01_2000",
    "CPSOCC80.01_2000",
    "OCCSP-55I_CODE_2000",
    "Q2-15B_2000",
    "Q10-2_2000",
    "Q13-5_TRUNC_REVISED_2000",
    "FAMSIZE_2000",
    "TNFI_TRUNC_2000",
    "POVSTATUS_2000",
    "marriage_col_2000",
    "MARSTAT-KEY_2000",
    "MO1M1B_XRND",
    "Q2-10B~Y_2012",
    "jobtype_2012",
    "OCCALL-EMP.01_2012",
    "OCCSP-55I_CODE_2012",
    "Q2-15A_2012",
    "Q12-6_2012",
    "income",
    "Q13-5_SR000001_2012",
    "Q13-5_SR000002_2012",
    "Q13-18_TRUNC_2012",
    "Q13-18_SR000001_TRUNC_2012",
    "familysize_2012",
    "REGION_2012",
    "HGC_2012",
    "URBAN-RURAL_2012",
    "jobsnum_2012")

Part 1: Some data summaries before cleaning the data

# Some table summaries
# Average income broken down by the raw (still numeric) gender and race codes.
table.mean.income.0 <- nlsy %>%
  group_by(gender,race) %>%
  summarize(mean.income.0 = round(mean(income), 0))
table.mean.income.0
## # A tibble: 6 x 3
## # Groups:   gender [2]
##   gender  race mean.income.0
##    <dbl> <dbl>         <dbl>
## 1      1     1         30047
## 2      1     2         21417
## 3      1     3         30998
## 4      2     1         19661
## 5      2     2         17133
## 6      2     3         16232
# Some graphic summaries
# NOTE(review): gender is still numeric at this point, hence the
# "Continuous x aesthetic" warning below; this deliberately motivates
# the factor recoding done in Part 2.
ggplot(data=nlsy, aes(x=gender, y=income, colour = expect_education)) + geom_boxplot()
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

Discussion: We can see from the rough table and graphic summary dividing income level by gender and race that there are some problems. First, we did not recode the factor variables from the raw dataset, so we do not know what each number refers to. Second, there are many negative values in the raw dataset, which distort the accuracy of our analysis. Third, since we want to explore differences in income by gender, the raw data hardly shows the result that we want.

Part 2: Data cleaning and data summary (with topcoded income)

# The full extract has over 67 columns; keep only the 11 needed for this
# analysis (identifiers, demographics, income, and candidate predictors).

subset.nlsy <- nlsy %>%
  select(versionid, caseid, birth_country, expect_education, race, gender,
         income, num_drug_1984, marriage_col_2000, familysize_2012,
         jobsnum_2012)

# In this extract every negative code marks a survey non-response, so turn
# negatives into NA, then drop any row that is incomplete on these columns.
subset.nlsy[subset.nlsy < 0] <- NA
new.nlsy <- subset.nlsy %>% na.omit()
new.nlsy
## # A tibble: 6,342 x 11
##    versionid caseid birth_country expect_education  race gender income
##        <dbl>  <dbl>         <dbl>            <dbl> <dbl>  <dbl>  <dbl>
##  1       445      2             2               12     3      2  19000
##  2       445      3             1               14     3      2  35000
##  3       445      6             1               18     3      1 105000
##  4       445      8             1               14     3      2  40000
##  5       445      9             1               13     3      1  75000
##  6       445     14             1               17     3      2 102000
##  7       445     15             1               18     3      1      0
##  8       445     16             1               13     3      2  70000
##  9       445     17             1               14     3      1  60000
## 10       445     18             1               18     3      1 150000
## # ... with 6,332 more rows, and 4 more variables: num_drug_1984 <dbl>,
## #   marriage_col_2000 <dbl>, familysize_2012 <dbl>, jobsnum_2012 <dbl>

Discussion: I select 11 variables that may be useful for my analysis to create a new data frame called new.nlsy, and I remove all rows containing negative values (recoded as NA).

# Convert the numeric survey codes into labelled factors. With
# recode_factor() the first replacement listed becomes the baseline level,
# so the ordering below fixes the reference categories used in the models.
new.nlsy <- new.nlsy %>%
  mutate(
    race = recode_factor(race,
                         `3` = "other",
                         `2` = "black",
                         `1` = "hispanic"),
    gender = recode_factor(gender,
                           `1` = "male",
                           `2` = "female"),
    num_drug_1984 = recode_factor(num_drug_1984,
                                  `0` = "never",
                                  `1` = "1-9",
                                  `2` = "10-39",
                                  `3` = "40-99",
                                  `4` = "100-999",
                                  `5` = "1000+"),
    # Never-married (1) and other (3) are collapsed into a single level.
    marriage_col_2000 = recode_factor(marriage_col_2000,
                                      `2` = "married",
                                      `1` = "other",
                                      `3` = "other"),
    birth_country = recode_factor(birth_country,
                                  `1` = "US",
                                  `2` = "Other"),
    # Collapse the 18 grade codes into three broad schooling bands.
    expect_education = recode_factor(expect_education,
                                     `13` = "college",
                                     `14` = "college",
                                     `15` = "college",
                                     `16` = "college",
                                     `17` = "college",
                                     `18` = "college",
                                     `7` = "middle_high",
                                     `8` = "middle_high",
                                     `9` = "middle_high",
                                     `10` = "middle_high",
                                     `11` = "middle_high",
                                     `12` = "middle_high",
                                     `1` = "primary",
                                     `2` = "primary",
                                     `3` = "primary",
                                     `4` = "primary",
                                     `5` = "primary",
                                     `6` = "primary"))

new.nlsy
## # A tibble: 6,342 x 11
##    versionid caseid birth_country expect_education race  gender income
##        <dbl>  <dbl> <fct>         <fct>            <fct> <fct>   <dbl>
##  1       445      2 Other         middle_high      other female  19000
##  2       445      3 US            college          other female  35000
##  3       445      6 US            college          other male   105000
##  4       445      8 US            college          other female  40000
##  5       445      9 US            college          other male    75000
##  6       445     14 US            college          other female 102000
##  7       445     15 US            college          other male        0
##  8       445     16 US            college          other female  70000
##  9       445     17 US            college          other male    60000
## 10       445     18 US            college          other male   150000
## # ... with 6,332 more rows, and 4 more variables: num_drug_1984 <fct>,
## #   marriage_col_2000 <fct>, familysize_2012 <dbl>, jobsnum_2012 <dbl>

Discussion: From the dataset, we can see that most variables are categorical, so for the convenience of my analysis I recode some of them and choose baseline levels. For marriage status, I combine never married and other into other; for expected education, I combine 18 levels into primary school, middle_high school and college for better analysis.

# simple summary of the data
# Structure check: confirms which columns are now factors vs numeric.
str(new.nlsy)
## Classes 'tbl_df', 'tbl' and 'data.frame':    6342 obs. of  11 variables:
##  $ versionid        : num  445 445 445 445 445 445 445 445 445 445 ...
##  $ caseid           : num  2 3 6 8 9 14 15 16 17 18 ...
##  $ birth_country    : Factor w/ 2 levels "US","Other": 2 1 1 1 1 1 1 1 1 1 ...
##  $ expect_education : Factor w/ 3 levels "college","middle_high",..: 2 1 1 1 1 1 1 1 1 1 ...
##  $ race             : Factor w/ 3 levels "other","black",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ gender           : Factor w/ 2 levels "male","female": 2 2 1 2 1 2 1 2 1 1 ...
##  $ income           : num  19000 35000 105000 40000 75000 102000 0 70000 60000 150000 ...
##  $ num_drug_1984    : Factor w/ 6 levels "never","1-9",..: 1 5 5 2 5 5 1 5 5 6 ...
##  $ marriage_col_2000: Factor w/ 2 levels "married","other": 1 1 1 2 1 2 1 1 2 2 ...
##  $ familysize_2012  : num  3 2 5 3 3 1 6 4 1 1 ...
##  $ jobsnum_2012     : num  3 17 12 13 5 24 9 12 14 4 ...
# Quartiles for numeric columns and level counts for factor columns.
# NOTE(review): versionid is constant (min = max = 445) — relevant to the
# NA coefficient that appears later in the regression output.
summary(new.nlsy)
##    versionid       caseid      birth_country    expect_education
##  Min.   :445   Min.   :    2   US   :5952    college    :3421   
##  1st Qu.:445   1st Qu.: 2479   Other: 390    middle_high:2895   
##  Median :445   Median : 4970                 primary    :  26   
##  Mean   :445   Mean   : 5294                                    
##  3rd Qu.:445   3rd Qu.: 7865                                    
##  Max.   :445   Max.   :12679                                    
##        race         gender         income       num_drug_1984 
##  other   :3221   male  :3024   Min.   :     0   never  :2459  
##  black   :1940   female:3318   1st Qu.:  1000   1-9    :1681  
##  hispanic:1181                 Median : 30000   10-39  : 707  
##                                Mean   : 42102   40-99  : 487  
##                                3rd Qu.: 56000   100-999: 572  
##                                Max.   :343830   1000+  : 436  
##  marriage_col_2000 familysize_2012   jobsnum_2012  
##  married:3577      Min.   : 1.000   Min.   : 0.00  
##  other  :2765      1st Qu.: 2.000   1st Qu.: 7.00  
##                    Median : 2.000   Median :11.00  
##                    Mean   : 2.627   Mean   :12.25  
##                    3rd Qu.: 3.000   3rd Qu.:16.00  
##                    Max.   :16.000   Max.   :58.00

Discussion: From the str() function, we can see that some variables have been recoded into factor variables.

# Make some tables to see the average income when broke down by gender and other factor variables

# Table 1: mean income by gender x race, reshaped wide (one column per gender).
table.mean.income.1 <- new.nlsy %>%
  group_by(gender,race) %>%
  summarize(mean.income.1 = round(mean(income), 0))

kable(spread(table.mean.income.1, gender, mean.income.1), 
      format = "markdown")
race male female
other 70789 33937
black 34061 24108
hispanic 47884 29033
# Table 2: mean income by gender x expected-education band.
table.mean.income.2 <- new.nlsy %>%
  group_by(gender,expect_education) %>%
  summarize(mean.income.2 = round(mean(income), 0))

kable(spread(table.mean.income.2, gender, mean.income.2), 
      format = "markdown")
expect_education male female
college 73759 37968
middle_high 35401 20207
primary 23000 6102
# Table 3: mean income by gender x marital status x family size (long form).
table.mean.income.3 <- new.nlsy %>%
  group_by(gender,marriage_col_2000, familysize_2012) %>%
  summarize(mean.income.3 = round(mean(income), 0))
table.mean.income.3
## # A tibble: 42 x 4
## # Groups:   gender, marriage_col_2000 [4]
##    gender marriage_col_2000 familysize_2012 mean.income.3
##    <fct>  <fct>                       <dbl>         <dbl>
##  1 male   married                         1         49101
##  2 male   married                         2         60862
##  3 male   married                         3         72436
##  4 male   married                         4         86635
##  5 male   married                         5         88478
##  6 male   married                         6         73609
##  7 male   married                         7         32644
##  8 male   married                         8         31167
##  9 male   married                        11        114000
## 10 male   other                           1         32928
## # ... with 32 more rows
# Table 4: mean income by gender x birth country x 1984 drug-use band.
# NOTE(review): some cells combine three grouping variables and may rest on
# very few observations — interpret the cell means with caution.
table.mean.income.4 <- new.nlsy %>%
  group_by(gender,birth_country, num_drug_1984) %>%
  summarize(mean.income.4 = round(mean(income), 0))
table.mean.income.4
## # A tibble: 24 x 4
## # Groups:   gender, birth_country [4]
##    gender birth_country num_drug_1984 mean.income.4
##    <fct>  <fct>         <fct>                 <dbl>
##  1 male   US            never                 56453
##  2 male   US            1-9                   58145
##  3 male   US            10-39                 65306
##  4 male   US            40-99                 56023
##  5 male   US            100-999               54894
##  6 male   US            1000+                 33795
##  7 male   Other         never                 46528
##  8 male   Other         1-9                   50854
##  9 male   Other         10-39                 92285
## 10 male   Other         40-99                 89535
## # ... with 14 more rows

Discussion: In order to investigate income differences by gender, I create some tables exploring the relationship between average income and gender, race, expected education and so on. For example, in table four, the average income of males born in the US who used drugs 10-39 times is even higher than that of those who never used drugs, contrary to what is commonly thought. Therefore, we should further discuss which factors influence the income differences by gender.

# Some graphics showing the relationship between gender and income

#qplot
# Scatter of income by gender, with race mapped to both color and shape.
qplot(x=gender, y=income, data=new.nlsy,
      color = race,
      shape = race,
      xlab = "gender",
      ylab = "income in 2012") 

# histogram
# Income distribution with gender as stacked fill (default 30 bins).
ggplot(new.nlsy, aes(x = income)) + xlab("income in 2012") + geom_histogram(aes(fill = gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram split by race, made interactive via plotly.
w <- ggplot(new.nlsy, aes(x = income)) + xlab("income in 2012") + geom_histogram(aes(fill = race))
ggplotly(w)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# boxplot and violin plot
# Income by expected education, split by gender.
ggplot(data=new.nlsy, aes(x=expect_education, y=income, colour = gender)) + geom_boxplot()

# Income by race, split by gender (interactive).
s <- ggplot(data=new.nlsy, aes(x=race, y=income, colour = gender)) + geom_boxplot()
ggplotly(s)
# Income by family size; familysize_2012 is numeric so it is wrapped in
# as.factor() to get one box per family size.
plot.1 <- ggplot(new.nlsy, aes(x = as.factor(familysize_2012), y = income)) +
  xlab("familysize in 2012") +
  ylab("income in 2012")
plot.1 + geom_boxplot()

# If I want to visualize the mean table did in the last part, for example, the barchart of race with income different by gender.

# Dodged bar chart of table.mean.income.1: mean income per race, one bar
# per gender, with a colorblind-friendly two-color palette.
plot.2 <- ggplot(data = table.mean.income.1, 
                aes(y = mean.income.1, x = race, fill = gender))
color.2 <- c("#D55E00", "#0072B2")
plot.2 + geom_bar(stat = "identity", position = "dodge") +
  ylab("average income in 2012") + 
  xlab("race") +
  guides(fill = guide_legend(title = "people's average income in 2012")) + 
  scale_fill_manual(values=color.2)

# If I want to visualize the mean table, for example, the barchart of expect_education with income different by gender.

# Table 5: mean income by gender x family size (feeds the bar chart below).
# NOTE(review): large family sizes have few observations; the zero means
# (e.g. male, size 9-10) reflect tiny cells, not true zero earnings groups.
table.mean.income.5 <- new.nlsy %>%
  group_by(gender,familysize_2012) %>%
  summarize(mean.income.5 = round(mean(income), 0))
table.mean.income.5
## # A tibble: 24 x 3
## # Groups:   gender [2]
##    gender familysize_2012 mean.income.5
##    <fct>            <dbl>         <dbl>
##  1 male                 1         36773
##  2 male                 2         51510
##  3 male                 3         59732
##  4 male                 4         78366
##  5 male                 5         78242
##  6 male                 6         71082
##  7 male                 7         25520
##  8 male                 8         23375
##  9 male                 9             0
## 10 male                10             0
## # ... with 14 more rows
# Dodged bar chart of table.mean.income.5.
plot.3 <- ggplot(data = table.mean.income.5, 
                aes(y = mean.income.5, x = familysize_2012, fill = gender))
color.3 <- c("#009E73", "#999999")
plot.3 + geom_bar(stat = "identity", position = "dodge") +
  ylab("average income in 2012") + 
  xlab("family size in 2012") +
  guides(fill = guide_legend(title = "people's average income in 2012")) + 
  scale_fill_manual(values=color.3)

# If I want to visualize the association between income and familysize_2012 depending on gender
# Scatter of income vs family size with a per-gender linear trend line.
# FIX: the original xlab/ggtitle described drug use in 1984, but the x
# aesthetic is familysize_2012 — the labels were copied from another plot.
ggplot(new.nlsy, 
       aes(x=familysize_2012, y=income, shape=gender, color=gender)) + 
  geom_point() + 
  geom_smooth(method = "lm") + 
  ylab("income in 2012") + 
  xlab("family size in 2012") +
  ggtitle("income in 2012 by family size and gender") 

# density plot 
# Overlapping income densities by gender, slightly smoothed (adjust = 1.5).
# FIX: the original call set xlim = c(-4, 6), which clipped essentially the
# whole income range (0 to 343,830) and triggered the
# "Removed 4825 rows containing non-finite values" warning — the limits
# look copied from a standardized/log-scale example, so they are dropped.
qplot(fill = gender, x = income, data = new.nlsy, geom = "density", 
      alpha = I(0.5),
      adjust = 1.5)

Discussion: From various kinds of ggplots, we can roughly see that income/average income in 2012 is not equally distributed between male and female. We can see that over 1500 respondents' income equals zero. Topcoded income (outliers) somewhat affects the distribution of income broken down by gender. People in larger families are more likely to earn higher income, and people with a higher level of expected education are more likely to have higher income.

Significance levels, t-tests, p-values, etc.

# Find whether there is any correlations between familysize and income
# Overall Pearson correlation (weakly positive).
cor(new.nlsy$familysize_2012,new.nlsy$income)
## [1] 0.08361683
# Does the correlation vary by gender?
new.nlsy %>%
  group_by(gender) %>%
  summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 2 x 2
##   gender cor_income_family
##   <fct>              <dbl>
## 1 male              0.179 
## 2 female           -0.0344
# Does the correlation vary by education and num_drug_1984?
# NOTE(review): the primary-education cells are tiny (26 rows total per the
# earlier summary); the cor of exactly -1 for primary/1-9 indicates that
# cell has only two observations, so it is not meaningful.
new.nlsy %>%
  group_by(expect_education, num_drug_1984) %>%
  summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 14 x 3
## # Groups:   expect_education [3]
##    expect_education num_drug_1984 cor_income_family
##    <fct>            <fct>                     <dbl>
##  1 college          never                   0.0685 
##  2 college          1-9                     0.139  
##  3 college          10-39                   0.0122 
##  4 college          40-99                   0.208  
##  5 college          100-999                 0.169  
##  6 college          1000+                  -0.0310 
##  7 middle_high      never                  -0.00866
##  8 middle_high      1-9                     0.0647 
##  9 middle_high      10-39                   0.149  
## 10 middle_high      40-99                   0.102  
## 11 middle_high      100-999                 0.00399
## 12 middle_high      1000+                   0.103  
## 13 primary          never                  -0.359  
## 14 primary          1-9                    -1

Discussion: I decide to run some correlations between income, family size and other variables, broken down by gender and other factors. Broken down by gender, the correlation between income and family size for females is -0.0344, which is surprising. Also, when broken down by expected education and number of times drugs were used in 1984, the college group with 1000+ uses and the middle_high group that never used drugs both show slightly negative correlations. (Note that the primary-education cells contain very few observations, so the correlation of exactly -1 there is not meaningful.)

# Testing differences in income bewteen male and female
# Visual check before the formal tests: income distribution by gender.
qplot(x = gender, y = income,
      geom = "boxplot", data = new.nlsy,
      xlab = "gender", 
      ylab = "income in 2012",
      fill = I("lightblue"))

Discussion: First I use a boxplot to see the distribution of income broken down by gender. Generally, male income is higher than female income. The median of male income is around 50000 and the median of female income is around 25000.

# Find the mean, standard deviation and standard errors to see wether the relationship between gender and income is statistically significant
# Per-gender n, mean, sd, and standard error of the mean (sd / sqrt(n));
# num.obs is defined earlier in the same summarize() call and reused.
new.nlsy %>%
  group_by(gender) %>%
  summarize(num.obs = n(),
            mean.income = round(mean(income), 0),
            sd.income = round(sd(income), 0),
            se.income = round(sd(income) / sqrt(num.obs), 0))
## # A tibble: 2 x 5
##   gender num.obs mean.income sd.income se.income
##   <fct>    <int>       <dbl>     <dbl>     <dbl>
## 1 male      3024       55384     70465      1281
## 2 female    3318       29996     35882       623

Discussion: We can see that the average income of males is much larger than the average income of females, but the male standard deviation is also much larger than the female one. The female standard error for income is very small, which means that the sample mean is very close to the population mean.

# Run a two-sample t-test
# Welch two-sample t-test (unequal variances) of income by gender.
income.t.test <- t.test(income ~ gender, data = new.nlsy)
income.t.test
## 
##  Welch Two Sample t-test
## 
## data:  income by gender
## t = 17.819, df = 4396.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  22594.80 28181.37
## sample estimates:
##   mean in group male mean in group female 
##             55384.15             29996.07

Discussion: We can see that the p-value is smaller than 2.2e-16, so at the 0.05 significance level the difference in mean income between male and female is statistically significant.

# p value
# Exact p-value stored in the htest object (the printout truncates it).
income.t.test$p.value
## [1] 1.23916e-68

Discussion: The t-test p-value is extremely small.

# group means in male and female
# The two sample means underlying the test statistic.
income.t.test$estimate
##   mean in group male mean in group female 
##             55384.15             29996.07

Discussion: There is a significant difference in average income between male and female (55384 vs 29996). The estimated difference in average income is about $25,388, with males earning more on average; the 95% confidence interval for this difference is given below.

# confidence interval for the difference
# 95% CI for (male mean - female mean); excludes zero, consistent with the
# tiny p-value above.
income.t.test$conf.int
## [1] 22594.80 28181.37
## attr(,"conf.level")
## [1] 0.95

Discussion: The 95% confidence interval for the difference in means is [22594.80, 28181.37].

# Also, try to run a wilcox test
# Rank-based (Wilcoxon/Mann-Whitney) alternative that does not assume
# normality — relevant because income is heavily right-skewed.
income.wilcox.test <- wilcox.test(income ~ gender, data=new.nlsy, conf.int=TRUE)
income.wilcox.test
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  income by gender
## W = 6232500, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  12000 16000
## sample estimates:
## difference in location 
##                  14000

Discussion: When running a Wilcoxon test, the p-value is also smaller than 2.2e-16, so the difference is statistically significant.

# exlpore whether data is normal
# Normal QQ plot of income, faceted by gender, with a reference line.
ggplot(data = new.nlsy, aes(sample = income)) + stat_qq() + stat_qq_line() + facet_grid(. ~ gender)

Discussion: From the QQ plot we can see that there are extremely high incomes, and the points deviate substantially from the reference line, so income is not normally distributed.

ANOVA

# Use ANOVA to test whether there is a significant association between gender and income
# One-way ANOVA: income ~ gender only.
summary(aov(income ~ gender, data = new.nlsy))
##               Df    Sum Sq       Mean Sq F value Pr(>F)    
## gender         1 1.020e+12 1019745466082   335.3 <2e-16 ***
## Residuals   6340 1.928e+13    3041129420                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Use ANOVA to test whether there is a significant association between gender, race, expect_education and income
# Additive (no interaction) model; aov uses sequential (Type I) sums of
# squares, so the order of the terms matters.
summary(aov(income ~ gender + race + expect_education, data = new.nlsy))
##                    Df    Sum Sq       Mean Sq F value Pr(>F)    
## gender              1 1.020e+12 1019745466082   369.3 <2e-16 ***
## race                2 6.454e+11  322709466221   116.9 <2e-16 ***
## expect_education    2 1.138e+12  569025755417   206.1 <2e-16 ***
## Residuals        6336 1.750e+13    2761567247                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Use ANOVA to test whether there is a significant association between gender, num_drug_1984, marriage_col_2000 and income
summary(aov(income ~ gender + num_drug_1984 + marriage_col_2000, data = new.nlsy))
##                     Df    Sum Sq       Mean Sq F value   Pr(>F)    
## gender               1 1.020e+12 1019745466082   348.2  < 2e-16 ***
## num_drug_1984        5 1.729e+11   34575426088    11.8 2.19e-11 ***
## marriage_col_2000    1 5.566e+11  556603604980   190.0  < 2e-16 ***
## Residuals         6334 1.855e+13    2928841141                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Discussion: We can see from the three outputs that all p-values are smaller than 2e-16, which means they are significant at the 0.05 level; there is an association between income and gender, race, num_drug_1984, marriage_col_2000 and so on.

**Linear regression**

# regress outcome variable income against all other variables

# Kitchen-sink linear model: income on every other column of new.nlsy.
# NOTE(review): versionid is constant (445 for every row, per the earlier
# summary()), so lm() cannot estimate its coefficient — hence the NA and the
# "1 not defined because of singularities" message below. Refitting with
# income ~ . - versionid would give an identical fit without the NA row.
# caseid is a respondent identifier and arguably should be excluded too.
lm.1 <- lm(income ~ ., data = new.nlsy)
lm.1
## 
## Call:
## lm(formula = income ~ ., data = new.nlsy)
## 
## Coefficients:
##                 (Intercept)                    versionid  
##                  83904.5394                           NA  
##                      caseid           birth_countryOther  
##                     -0.3244                    4245.4622  
## expect_educationmiddle_high      expect_educationprimary  
##                 -25934.2485                  -39654.4835  
##                   raceblack                 racehispanic  
##                 -17276.9106                  -10195.5902  
##                genderfemale             num_drug_19841-9  
##                 -27623.3063                    2201.8980  
##          num_drug_198410-39           num_drug_198440-99  
##                   7427.0388                    2789.2523  
##        num_drug_1984100-999           num_drug_19841000+  
##                    629.6002                   -8180.1443  
##      marriage_col_2000other              familysize_2012  
##                  -9017.5803                    1773.3934  
##                jobsnum_2012  
##                   -698.4010
summary(lm.1)
## 
## Call:
## lm(formula = income ~ ., data = new.nlsy)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98972 -28207  -7776  14733 320924 
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate  Std. Error t value Pr(>|t|)    
## (Intercept)                  83904.5394   2634.4909  31.848  < 2e-16 ***
## versionid                            NA          NA      NA       NA    
## caseid                          -0.3244      0.2326  -1.394 0.163245    
## birth_countryOther            4245.4622   2930.5040   1.449 0.147467    
## expect_educationmiddle_high -25934.2485   1327.7997 -19.532  < 2e-16 ***
## expect_educationprimary     -39654.4835  10431.5005  -3.801 0.000145 ***
## raceblack                   -17276.9106   1751.4076  -9.865  < 2e-16 ***
## racehispanic                -10195.5902   2080.5784  -4.900 9.80e-07 ***
## genderfemale                -27623.3063   1335.6608 -20.681  < 2e-16 ***
## num_drug_19841-9              2201.8980   1659.1692   1.327 0.184521    
## num_drug_198410-39            7427.0388   2241.5209   3.313 0.000927 ***
## num_drug_198440-99            2789.2523   2608.9259   1.069 0.285057    
## num_drug_1984100-999           629.6002   2469.0916   0.255 0.798737    
## num_drug_19841000+           -8180.1443   2799.7279  -2.922 0.003493 ** 
## marriage_col_2000other       -9017.5803   1459.4575  -6.179 6.86e-10 ***
## familysize_2012               1773.3934    489.0990   3.626 0.000290 ***
## jobsnum_2012                  -698.4010     96.0148  -7.274 3.92e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51830 on 6326 degrees of freedom
## Multiple R-squared:  0.1628, Adjusted R-squared:  0.1608 
## F-statistic: 81.99 on 15 and 6326 DF,  p-value: < 2.2e-16
# Markdown coefficient table (kable silently drops the NA versionid row).
kable(summary(lm.1)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 83904.539 2634.491 31.848 0.0000
caseid -0.324 0.233 -1.394 0.1632
birth_countryOther 4245.462 2930.504 1.449 0.1475
expect_educationmiddle_high -25934.248 1327.800 -19.532 0.0000
expect_educationprimary -39654.483 10431.500 -3.801 0.0001
raceblack -17276.911 1751.408 -9.865 0.0000
racehispanic -10195.590 2080.578 -4.900 0.0000
genderfemale -27623.306 1335.661 -20.681 0.0000
num_drug_19841-9 2201.898 1659.169 1.327 0.1845
num_drug_198410-39 7427.039 2241.521 3.313 0.0009
num_drug_198440-99 2789.252 2608.926 1.069 0.2851
num_drug_1984100-999 629.600 2469.092 0.255 0.7987
num_drug_19841000+ -8180.144 2799.728 -2.922 0.0035
marriage_col_2000other -9017.580 1459.458 -6.179 0.0000
familysize_2012 1773.393 489.099 3.626 0.0003
jobsnum_2012 -698.401 96.015 -7.274 0.0000

Discussion: Now we run a regression model, regressing income on gender and all other variables. We can see that most p-values are very small, close to zero for race and expected education, which means race may be a very significant factor influencing income differences between male and female. Note that many coefficients are negative relative to their baseline categories — for example genderfemale, raceblack, expect_educationmiddle_high and marriage_col_2000other — so those groups are associated with lower income, not higher. The adjusted R-squared is 0.1608, which means that about 16% of the variance of income can be explained by these factors.

# regress outcome variable income against gender, race, expect_education, marriage and familsize

# Reduced model keeping only the substantively interesting predictors
# (drops the IDs, birth country, drug use, and job count from lm.1).
lm.2 <- lm(income ~ gender + race + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy)
lm.2
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 + 
##     familysize_2012, data = new.nlsy)
## 
## Coefficients:
##                 (Intercept)                 genderfemale  
##                       74260                       -26224  
##                   raceblack                 racehispanic  
##                      -17845                       -10342  
## expect_educationmiddle_high      expect_educationprimary  
##                      -25525                       -34765  
##      marriage_col_2000other              familysize_2012  
##                      -10464                         2018
summary(lm.2)
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 + 
##     familysize_2012, data = new.nlsy)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90406 -27793  -7760  14470 321386 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  74260.1     1943.4  38.211  < 2e-16 ***
## genderfemale                -26224.1     1314.4 -19.952  < 2e-16 ***
## raceblack                   -17844.8     1566.5 -11.391  < 2e-16 ***
## racehispanic                -10341.8     1802.8  -5.737 1.01e-08 ***
## expect_educationmiddle_high -25525.1     1328.3 -19.217  < 2e-16 ***
## expect_educationprimary     -34765.3    10338.2  -3.363 0.000776 ***
## marriage_col_2000other      -10464.4     1456.5  -7.184 7.52e-13 ***
## familysize_2012               2018.3      490.7   4.113 3.95e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52170 on 6334 degrees of freedom
## Multiple R-squared:  0.1509, Adjusted R-squared:   0.15 
## F-statistic: 160.8 on 7 and 6334 DF,  p-value: < 2.2e-16
# Markdown coefficient table for the reduced model.
kable(summary(lm.2)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 74260.134 1943.405 38.211 0.0000
genderfemale -26224.055 1314.356 -19.952 0.0000
raceblack -17844.845 1566.510 -11.391 0.0000
racehispanic -10341.816 1802.806 -5.737 0.0000
expect_educationmiddle_high -25525.122 1328.265 -19.217 0.0000
expect_educationprimary -34765.265 10338.199 -3.363 0.0008
marriage_col_2000other -10464.380 1456.524 -7.184 0.0000
familysize_2012 2018.251 490.669 4.113 0.0000

Discussion: Now we run a regression model, regressing income on gender, race, expected education, marriage status and family size. All p-values are very small — effectively zero for race and expected education — which suggests these are highly significant predictors of income. All reported coefficients except family size are negative, so female gender, non-baseline race, lower expected education and non-married status are each associated with lower income. The R-squared is 0.1509, which means about 15% of the variance of income is explained by these predictors.

# Plot the linear regression model
# plot() on an lm object produces the four standard diagnostic plots:
# residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage.
plot(lm.2)

# Collinearity and pairs plots
income.var.names.1 <- c("gender", "race", "expect_education", "marriage_col_2000", "familysize_2012")

# Pairwise scatterplots of the predictors to eyeball collinearity
# (factors are plotted as their integer codes here).
pairs(new.nlsy[,income.var.names.1])

Discussion: Now we run a regression model, regressing income on gender, race, expected education, marriage status and family size. From the residuals-vs-fitted graph, we can see that although there are some outliers, the residuals show a somewhat fan-shaped spread, so the variance is not constant. The Q-Q plot shows the normality assumption does not fully hold, but I still use a linear model for prediction; in this case the fit is not ideal but seems acceptable. The scale-location plot is another version of the residuals-vs-fitted plot. From the residuals-vs-leverage plot, we can see that points with both high residuals and high leverage (outliers) pull the model fit away from the rest of the data.

From the colinearity table, we can see that in this model, the colinear relationship between each variable is very weak.

# regress outcome variable income against gender, race, expect_education, drug use and familysize

# Fit OLS model lm.3: same as lm.2 but marriage status is replaced by
# num_drug_1984 (a 6-level factor of lifetime drug-use frequency, baseline "never").
lm.3 <- lm(income ~ gender + race + expect_education + num_drug_1984 + familysize_2012, data = new.nlsy)
lm.3
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 + 
##     familysize_2012, data = new.nlsy)
## 
## Coefficients:
##                 (Intercept)                 genderfemale  
##                     69714.4                     -27574.2  
##                   raceblack                 racehispanic  
##                    -21018.3                     -11937.9  
## expect_educationmiddle_high      expect_educationprimary  
##                    -25983.6                     -35323.4  
##            num_drug_19841-9           num_drug_198410-39  
##                       702.8                       5630.0  
##          num_drug_198440-99         num_drug_1984100-999  
##                       297.7                      -2948.0  
##          num_drug_19841000+              familysize_2012  
##                    -13469.7                       2979.1
summary(lm.3)
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 + 
##     familysize_2012, data = new.nlsy)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -93548 -28093  -7486  14941 322528 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  69714.4     2095.8  33.264  < 2e-16 ***
## genderfemale                -27574.2     1341.4 -20.557  < 2e-16 ***
## raceblack                   -21018.3     1511.7 -13.903  < 2e-16 ***
## racehispanic                -11937.9     1799.3  -6.635 3.52e-11 ***
## expect_educationmiddle_high -25983.6     1327.6 -19.572  < 2e-16 ***
## expect_educationprimary     -35323.4    10372.8  -3.405 0.000665 ***
## num_drug_19841-9               702.8     1660.7   0.423 0.672146    
## num_drug_198410-39            5630.0     2244.6   2.508 0.012158 *  
## num_drug_198440-99             297.7     2610.2   0.114 0.909184    
## num_drug_1984100-999         -2948.0     2458.5  -1.199 0.230522    
## num_drug_19841000+          -13469.7     2764.9  -4.872 1.13e-06 ***
## familysize_2012               2979.1      469.3   6.348 2.34e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52240 on 6330 degrees of freedom
## Multiple R-squared:  0.1491, Adjusted R-squared:  0.1476 
## F-statistic: 100.8 on 11 and 6330 DF,  p-value: < 2.2e-16
kable(summary(lm.3)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 69714.376 2095.816 33.264 0.0000
genderfemale -27574.157 1341.352 -20.557 0.0000
raceblack -21018.264 1511.739 -13.903 0.0000
racehispanic -11937.941 1799.327 -6.635 0.0000
expect_educationmiddle_high -25983.624 1327.618 -19.572 0.0000
expect_educationprimary -35323.360 10372.841 -3.405 0.0007
num_drug_19841-9 702.834 1660.665 0.423 0.6721
num_drug_198410-39 5629.991 2244.599 2.508 0.0122
num_drug_198440-99 297.748 2610.175 0.114 0.9092
num_drug_1984100-999 -2948.049 2458.490 -1.199 0.2305
num_drug_19841000+ -13469.729 2764.865 -4.872 0.0000
familysize_2012 2979.142 469.338 6.348 0.0000

Discussion: Now we run a regression model, regressing income on gender, race, expected education, num_drug_1984 and family size. Most p-values are very small — effectively zero for race and expected education — which suggests these are highly significant predictors of income. However, the p-values for most levels of drug use are large, which suggests drug use may not be a significant factor overall. The adjusted R-squared is 0.1476, which means about 15% of the variance of income is explained by these predictors.

# regress outcome variable income against gender, race, expect_education, jobsnum, marriage and familysize

# Fit OLS model lm.4: adds the number of jobs held in 2012 (jobsnum_2012,
# numeric) to the lm.2 specification. This is the model used for the
# intercept/interaction analysis below.
lm.4 <- lm(income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy)
lm.4
## 
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education + 
##     marriage_col_2000 + familysize_2012, data = new.nlsy)
## 
## Coefficients:
##                 (Intercept)                 genderfemale  
##                     84405.5                     -27093.3  
##                   raceblack                 racehispanic  
##                    -18556.2                     -10712.6  
##                jobsnum_2012  expect_educationmiddle_high  
##                      -723.9                     -26361.8  
##     expect_educationprimary       marriage_col_2000other  
##                    -38791.6                      -9352.6  
##             familysize_2012  
##                      1782.4
summary(lm.4)
## 
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education + 
##     marriage_col_2000 + familysize_2012, data = new.nlsy)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96493 -28371  -7669  14704 322047 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  84405.46    2340.59  36.062  < 2e-16 ***
## genderfemale                -27093.30    1313.21 -20.631  < 2e-16 ***
## raceblack                   -18556.15    1562.08 -11.879  < 2e-16 ***
## racehispanic                -10712.56    1795.21  -5.967 2.54e-09 ***
## jobsnum_2012                  -723.92      94.01  -7.700 1.57e-14 ***
## expect_educationmiddle_high -26361.75    1326.65 -19.871  < 2e-16 ***
## expect_educationprimary     -38791.59   10304.23  -3.765 0.000168 ***
## marriage_col_2000other       -9352.59    1457.04  -6.419 1.47e-10 ***
## familysize_2012               1782.42     489.39   3.642 0.000273 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51930 on 6333 degrees of freedom
## Multiple R-squared:  0.1588, Adjusted R-squared:  0.1577 
## F-statistic: 149.4 on 8 and 6333 DF,  p-value: < 2.2e-16
# Coefficient table for lm.4, the model fitted in this section.
# (The original chunk mistakenly re-printed lm.3's coefficients here,
# as the echoed output below — num_drug_1984 rows — shows.)
kable(summary(lm.4)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 69714.376 2095.816 33.264 0.0000
genderfemale -27574.157 1341.352 -20.557 0.0000
raceblack -21018.264 1511.739 -13.903 0.0000
racehispanic -11937.941 1799.327 -6.635 0.0000
expect_educationmiddle_high -25983.624 1327.618 -19.572 0.0000
expect_educationprimary -35323.360 10372.841 -3.405 0.0007
num_drug_19841-9 702.834 1660.665 0.423 0.6721
num_drug_198410-39 5629.991 2244.599 2.508 0.0122
num_drug_198440-99 297.748 2610.175 0.114 0.9092
num_drug_1984100-999 -2948.049 2458.490 -1.199 0.2305
num_drug_19841000+ -13469.729 2764.865 -4.872 0.0000
familysize_2012 2979.142 469.338 6.348 0.0000

Discussion: Now we run a regression model, regressing income on gender, race, expected education, number of jobs, marriage status and family size. Most p-values are very small — effectively zero for race and expected education — which suggests these are highly significant predictors of income differences between men and women. The coefficient on jobsnum_2012 is negative and significant, so holding other variables constant, holding more jobs in 2012 is associated with lower income. The adjusted R-squared is 0.1577, which means about 16% of the variance of income is explained by these predictors.

Interpret the coefficient of genderfemale: on average, holding the other variables constant, the income of women is about $27,093 less than that of men (model lm.4).

# Calculate race-specific intercepts
# The baseline race level is "other", so its intercept is the model
# intercept; the black and hispanic intercepts add their dummy coefficients.
intercepts <- c(coef(lm.4)["(Intercept)"],
                coef(lm.4)["(Intercept)"] + coef(lm.4)["raceblack"],
                coef(lm.4)["(Intercept)"] + coef(lm.4)["racehispanic"])

# One fitted line per race: common slope on jobsnum_2012, race-specific intercept.
# NOTE(review): assumes levels(new.nlsy$race) is c("other", "black", "hispanic")
# so rows line up with the intercepts vector — confirm against the recode step.
lines.df <- data.frame(intercepts = intercepts,
                       slopes = rep(coef(lm.4)["jobsnum_2012"], 3),
                       race = levels(new.nlsy$race))

# Scatter income vs. number of jobs, colored by race, with the three
# race-specific regression lines overlaid.
qplot(x = jobsnum_2012, y = income, color = race, data = new.nlsy) + 
  geom_abline(aes(intercept = intercepts, 
                  slope = slopes, 
                  color = race), data = lines.df)

Interpreting the coefficients for race: the baseline race is "other". Both raceblack (-18556) and racehispanic (-10713) are negative, so among people with the same number of jobs in 2012, black and hispanic respondents earn less on average than the baseline group; because the hispanic coefficient is less negative, hispanic respondents earn more on average than black respondents.

# Plot the linear regression model
plot(lm.4)

# Collinearity and pairs plots
income.var.names.3 <- c("gender", "race", "jobsnum_2012", "expect_education", "familysize_2012","marriage_col_2000")

pairs(new.nlsy[,income.var.names.3])

Discussion: Now we run a regression model, regress income on gender race, expected eduacation, marriage status and familisize. From the residuals fitted graph, we can see that although there are some outliers, generally the residuals have comparatively fan-shape variance, it is not very constant. From the qqplot, I use a linear model for prediction even if the underlying normality assumptions do not hold. Inthis case, the model is not best fitted but seems good. Scale-location plot This is another version of the residuals vs fitted plot. For residuals vs leverage, we can see that high residuals and high leverage (outliers) skew the model fit away from the rest of data.

From the colinearity table, this time although the colinear relationship between different factor varibales is still weak, it is better than former model. We can see there are some colinearity between jobsnum_2012, expected education and familysize.

Do some interactions in regression

#Whether race, marriage,familysize is a significant predictor of income differences between male and female

anova(update(lm.4, . ~ . - race), lm.4)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
##   Res.Df            RSS Df    Sum of Sq      F    Pr(>F)    
## 1   6335 17470336498276                                     
## 2   6333 17077456571730  2 392879926546 72.848 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(update(lm.4, . ~ . - marriage_col_2000), lm.4)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
##   Res.Df            RSS Df    Sum of Sq      F    Pr(>F)    
## 1   6334 17188562072246                                     
## 2   6333 17077456571730  1 111105500517 41.202 1.472e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(update(lm.4, . ~ . - familysize_2012), lm.4)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
##   Res.Df            RSS Df   Sum of Sq      F    Pr(>F)    
## 1   6334 17113227527406                                    
## 2   6333 17077456571730  1 35770955677 13.265 0.0002725 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Discussion: We can see from the three ANOVA outputs that the p-values are all smaller than 0.05, which means that race, marriage status, and family size each add statistically significant explanatory power to the model.

#interact gender with race

lm.4.interact <- update(lm.4, . ~ . + gender*race)

summary(lm.4.interact)
## 
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education + 
##     marriage_col_2000 + familysize_2012 + gender:race, data = new.nlsy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100482  -27473   -8069   14292  329293 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  90030.91    2416.78  37.252  < 2e-16 ***
## genderfemale                -37888.59    1821.40 -20.802  < 2e-16 ***
## raceblack                   -31553.64    2187.30 -14.426  < 2e-16 ***
## racehispanic                -19553.25    2564.93  -7.623 2.84e-14 ***
## jobsnum_2012                  -667.93      93.69  -7.129 1.12e-12 ***
## expect_educationmiddle_high -26221.48    1319.07 -19.879  < 2e-16 ***
## expect_educationprimary     -39710.29   10249.31  -3.874 0.000108 ***
## marriage_col_2000other       -9924.22    1450.25  -6.843 8.48e-12 ***
## familysize_2012               1556.81     487.30   3.195 0.001406 ** 
## genderfemale:raceblack       25184.59    2982.88   8.443  < 2e-16 ***
## genderfemale:racehispanic    17213.80    3523.87   4.885 1.06e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51630 on 6331 degrees of freedom
## Multiple R-squared:  0.1688, Adjusted R-squared:  0.1675 
## F-statistic: 128.6 on 10 and 6331 DF,  p-value: < 2.2e-16
kable(coef(summary(lm.4)), digits = c(0, 0, 2, 4))
Estimate Std. Error t value Pr(>|t|)
(Intercept) 84405 2341 36.06 0.0000
genderfemale -27093 1313 -20.63 0.0000
raceblack -18556 1562 -11.88 0.0000
racehispanic -10713 1795 -5.97 0.0000
jobsnum_2012 -724 94 -7.70 0.0000
expect_educationmiddle_high -26362 1327 -19.87 0.0000
expect_educationprimary -38792 10304 -3.76 0.0002
marriage_col_2000other -9353 1457 -6.42 0.0000
familysize_2012 1782 489 3.64 0.0003
kable(coef(summary(lm.4.interact)), digits = c(0, 0, 2, 4))
Estimate Std. Error t value Pr(>|t|)
(Intercept) 90031 2417 37.25 0.0000
genderfemale -37889 1821 -20.80 0.0000
raceblack -31554 2187 -14.43 0.0000
racehispanic -19553 2565 -7.62 0.0000
jobsnum_2012 -668 94 -7.13 0.0000
expect_educationmiddle_high -26221 1319 -19.88 0.0000
expect_educationprimary -39710 10249 -3.87 0.0001
marriage_col_2000other -9924 1450 -6.84 0.0000
familysize_2012 1557 487 3.19 0.0014
genderfemale:raceblack 25185 2983 8.44 0.0000
genderfemale:racehispanic 17214 3524 4.88 0.0000
Discussion: According to the tables, when we interact gender and race, the genderfemale:raceblack coefficient (25185) means that the male-female income gap among black respondents is about $25,185 smaller than the gap among the baseline "other" race group. Race is a factor that is strongly associated with the income gap between men and women; the sizeable change in the other coefficients after adding the interaction further suggests race is an important factor.
# Use ANOVA to further discuss whether there is statistically significance in income gaps between different races.

anova(lm.4, lm.4.interact)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012 + gender:race
##   Res.Df            RSS Df    Sum of Sq      F    Pr(>F)    
## 1   6333 17077456571730                                     
## 2   6331 16873273379046  2 204183192684 38.306 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new.nlsy %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]))
## # A tibble: 3 x 2
##   expect_education income.gap
##   <fct>                 <dbl>
## 1 college              35791.
## 2 middle_high          15193.
## 3 primary              16898.

Discussion: This is the income gap between men and women at different expected education levels. It is surprising that the income gap within the primary-school group is higher than within the middle/high-school group, which means a higher expected education level does not directly lead to a larger gender income gap.

# Bar chart of the male-female income gap by expected education level
# (error bars with confidence intervals are added in the next chunk).
gap <- new.nlsy %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]))

# Order the education levels from largest gap to smallest for plotting
gap <- mutate(gap,
              expect_education = reorder(expect_education, -income.gap))

ggplot(gap, aes(x = expect_education, y = income.gap, fill = expect_education)) +
  geom_bar(stat = "identity") +
  xlab("expected education") + 
  ylab("Income gap by education") +
  ggtitle("Income gap between male and female, by expected education") + 
  guides(fill = FALSE)

# Calculate income gaps (male - female) and 95% confidence intervals
# for each expected-education group, plus a significance flag.
# NOTE: t.test()$conf.int[1] is the LOWER bound and [2] the UPPER bound;
# the original assigned them to the opposite names, which flips the
# error bars drawn in the plot below.
gap.conf <- new.nlsy %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]),
            lower = t.test(income ~ gender)$conf.int[1],
            upper = t.test(income ~ gender)$conf.int[2],
            is.significant = as.numeric(t.test(income ~ gender)$p.value < 0.05))

# reorder the expected education factor according to gap size
gap.conf <- mutate(gap.conf,
                        expect_education = reorder(expect_education, income.gap))

# error bar plots
ggplot(data = gap.conf, aes(x = expect_education, y = income.gap,
                            fill = is.significant)) +
  geom_bar(stat = "identity") +
  xlab("expected education") + 
  ylab("income gap by expected education") +
  ggtitle("Income gap between male and female by expected education") + 
  guides(fill = FALSE) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = 0.1, size = 1) +
  theme(text = element_text(size=12)) 

Part 3 Findings (without topcoded income)

# The outcome variable of income in 2012 is topcoded, and we can see from the analysis above that the extreme high income values somewhat affect the results. Therefore, I choose to remove observations that earn highest income to run regression.

# Find the exact value of the topcoded (maximum) income value
max(new.nlsy$income)
## [1] 343830
# Create a new dataframe that drops observations at the topcoded maximum
# income. Using max(income) instead of the hard-coded literal 343830 keeps
# the filter correct if the data are re-downloaded or re-coded.
# (subset() evaluates the expression inside the data frame, so `income`
# refers to the column.)
new.nlsy.2 <- subset(new.nlsy, income != max(income, na.rm = TRUE))
new.nlsy.2
## # A tibble: 6,207 x 11
##    versionid caseid birth_country expect_education race  gender income
##        <dbl>  <dbl> <fct>         <fct>            <fct> <fct>   <dbl>
##  1       445      2 Other         middle_high      other female  19000
##  2       445      3 US            college          other female  35000
##  3       445      6 US            college          other male   105000
##  4       445      8 US            college          other female  40000
##  5       445      9 US            college          other male    75000
##  6       445     14 US            college          other female 102000
##  7       445     15 US            college          other male        0
##  8       445     16 US            college          other female  70000
##  9       445     17 US            college          other male    60000
## 10       445     18 US            college          other male   150000
## # ... with 6,197 more rows, and 4 more variables: num_drug_1984 <fct>,
## #   marriage_col_2000 <fct>, familysize_2012 <dbl>, jobsnum_2012 <dbl>
# Convert variables to factors and give the factors more meaningful levels

# recode_factor() also fixes the level ORDER to the order given below, which
# determines the baseline level in later regressions (other / male / never /
# married / US / college are the baselines).
new.nlsy.2 <- mutate(new.nlsy.2, 
                   race = recode_factor(race,
                                    `3` = "other",    
                                    `2` = "black",
                                    `1` = "hispanic"),
                   gender = recode_factor(gender, 
                                    `1` = "male",
                                    `2` = "female"))

# Drug-use frequency in 1984 becomes an ordered set of labelled bins;
# marriage codes 1 and 3 are collapsed into a single "other" level.
new.nlsy.2 <- mutate(new.nlsy.2, 
                   num_drug_1984 = recode_factor(num_drug_1984,
                                    `0` = "never",
                                    `1` = "1-9",
                                    `2` = "10-39",
                                    `3` = "40-99",
                                    `4` = "100-999",
                                    `5` = "1000+"),
                   marriage_col_2000 = recode_factor(marriage_col_2000,
                                    `2` = "married",
                                    `1` = "other",
                                    `3` = "other"),
                   birth_country = recode_factor(birth_country,
                                    `1` = "US",
                                    `2` = "Other"))
                   
# Collapse the 18 expected-education grades into three bands:
# 13-18 -> college, 7-12 -> middle_high, 1-6 -> primary.
new.nlsy.2 <- mutate(new.nlsy.2, 
                   expect_education = recode_factor(expect_education, 
                                    `13`= "college",
                                    `14`= "college",
                                    `15`= "college",
                                    `16`= "college",
                                    `17`= "college",
                                    `18`= "college",
                                    `7`= "middle_high",
                                    `8`= "middle_high",
                                    `9`= "middle_high",
                                    `10`= "middle_high",
                                    `11`= "middle_high",
                                    `12`= "middle_high",
                                    `1`= "primary",
                                    `2`= "primary",
                                    `3`= "primary",
                                    `4`= "primary",
                                    `5`= "primary",
                                    `6`= "primary"
                                      ))

new.nlsy.2
## # A tibble: 6,207 x 11
##    versionid caseid birth_country expect_education race  gender income
##        <dbl>  <dbl> <fct>         <fct>            <fct> <fct>   <dbl>
##  1       445      2 Other         middle_high      other female  19000
##  2       445      3 US            college          other female  35000
##  3       445      6 US            college          other male   105000
##  4       445      8 US            college          other female  40000
##  5       445      9 US            college          other male    75000
##  6       445     14 US            college          other female 102000
##  7       445     15 US            college          other male        0
##  8       445     16 US            college          other female  70000
##  9       445     17 US            college          other male    60000
## 10       445     18 US            college          other male   150000
## # ... with 6,197 more rows, and 4 more variables: num_drug_1984 <fct>,
## #   marriage_col_2000 <fct>, familysize_2012 <dbl>, jobsnum_2012 <dbl>
# Make some tables to see the average income when broken down by gender and other factor variables

# Mean income (rounded to whole dollars) for each gender x race cell
table.mean.income.a <- new.nlsy.2 %>%
  group_by(gender,race) %>%
  summarize(mean.income.a = round(mean(income), 0))

# Spread gender into columns to get a compact race-by-gender table
kable(spread(table.mean.income.a, gender, mean.income.a), 
      format = "markdown")
race male female
other 52511 32076
black 30317 24108
hispanic 39693 28021

Discussion: When we remove the topcoded income, the distribution of income among male and female is more balanced.

# Some graphics showing the relationship between gender and income

ggplot(data=new.nlsy.2, aes(x=race, y=income, colour = gender)) + geom_boxplot()

Discussion: After removing the outliers, people’s income divied by gender and race

significance level, t-test, p-value etc.

# Find whether there is any correlations between familysize and income
cor(new.nlsy.2$familysize_2012,new.nlsy.2$income)
## [1] 0.06185022
# Does the correlation vary by gender?
new.nlsy.2 %>%
  group_by(gender) %>%
  summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 2 x 2
##   gender cor_income_family
##   <fct>              <dbl>
## 1 male              0.177 
## 2 female           -0.0358
# Does the correlation vary by education and num_drug_1984?
new.nlsy.2 %>%
  group_by(expect_education, num_drug_1984) %>%
  summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 14 x 3
## # Groups:   expect_education [3]
##    expect_education num_drug_1984 cor_income_family
##    <fct>            <fct>                     <dbl>
##  1 college          never                  0.0555  
##  2 college          1-9                    0.0946  
##  3 college          10-39                  0.0395  
##  4 college          40-99                  0.118   
##  5 college          100-999                0.0505  
##  6 college          1000+                 -0.0231  
##  7 middle_high      never                  0.000914
##  8 middle_high      1-9                    0.0562  
##  9 middle_high      10-39                  0.131   
## 10 middle_high      40-99                  0.0831  
## 11 middle_high      100-999                0.0147  
## 12 middle_high      1000+                  0.186   
## 13 primary          never                 -0.359   
## 14 primary          1-9                   -1

Discussion: I decide to run some correlations between income, family size and other factor variables. Broken down by gender, the correlation between income and family size among women is -0.0358, which is surprising. Also, when broken down by expected education and number of times using drugs in 1984, the college / 1000+ group (-0.023) and the primary groups (never: -0.359; 1-9: -1) show negative correlations too.

# Testing differences in income bewteen male and female
qplot(x = gender, y = income,
      geom = "boxplot", data = new.nlsy.2,
      xlab = "gender", 
      ylab = "income in 2012",
      fill = I("lightblue"))

Discussion: First I use a boxplot to see the distribution of income broken down by gender. Generally, men's income is higher than women's. The median of men's income is around 47,500 and the median of women's income is around 25,000.

# Find the mean, standard deviation and standard error of income by gender
# to gauge whether the gender difference could be statistically significant.
new.nlsy.2 %>%
  group_by(gender) %>%
  summarize(num.obs = n(),
            mean.income = round(mean(income), 0),
            sd.income = round(sd(income), 0),
            # standard error of the mean: sd / sqrt(n)
            se.income = round(sd(income) / sqrt(num.obs), 0))
## # A tibble: 2 x 5
##   gender num.obs mean.income sd.income se.income
##   <fct>    <int>       <dbl>     <dbl>     <dbl>
## 1 male      2901       43154     38696       718
## 2 female    3306       28857     30550       531

Discussion: we can see that after removing the topcoded income values, the differences of average income between male and female is smaller.

# Run a two-sample t-test
income.t.test.a <- t.test(income ~ gender, data = new.nlsy.2)
income.t.test.a
## 
##  Welch Two Sample t-test
## 
## data:  income by gender
## t = 16, df = 5496.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  12545.62 16049.11
## sample estimates:
##   mean in group male mean in group female 
##             43154.29             28856.92

Discussion: We can see that the p-value is smaller than 2.2e-16, so at the 5% significance level we reject the null hypothesis: the difference in mean income between men and women is statistically significant.

# p value
income.t.test.a$p.value
## [1] 2.343502e-56

Discussion: The t-test p-value is extremely small.

# group means in male and female
income.t.test.a$estimate
##   mean in group male mean in group female 
##             43154.29             28856.92

Discussion: There is a significant difference in average income between men and women (43154.29 vs 28856.92). The point estimate of the gap is about $14,297, and the 95% confidence interval for the gap is given below.

# confidence interval for the difference
income.t.test.a$conf.int
## [1] 12545.62 16049.11
## attr(,"conf.level")
## [1] 0.95

Discussion: The confidence interval is [12545.62,16049.11]

# Also, try to run a wilcox test
income.wilcox.test.a <- wilcox.test(income ~ gender, data=new.nlsy.2, conf.int=TRUE)
income.wilcox.test.a
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  income by gender
## W = 5825100, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##   9000 13000
## sample estimates:
## difference in location 
##                  11000

Discussion: When running a Wilcoxon test, the p-value is also smaller than 2.2e-16, which is statistically significant.

# exlpore whether data is normal
ggplot(data = new.nlsy.2, aes(sample = income)) + stat_qq() + stat_qq_line() + facet_grid(. ~ gender)

Discussion: After removing the topcoded values, the data seems more normal than before.

ANOVA

# Use ANOVA to test whether there is a significant association between gender and income
summary(aov(income ~ gender, data = new.nlsy.2))
##               Df        Sum Sq      Mean Sq F value Pr(>F)    
## gender         1  315849828529 315849828529   263.9 <2e-16 ***
## Residuals   6205 7426944326143   1196928981                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Use ANOVA to test whether there is a significant association between gender, race, expect_education and income
summary(aov(income ~ gender + race + expect_education, data = new.nlsy.2))
##                    Df        Sum Sq      Mean Sq F value Pr(>F)    
## gender              1  315849828529 315849828529   294.1 <2e-16 ***
## race                2  261357016364 130678508182   121.7 <2e-16 ***
## expect_education    2  507077934072 253538967036   236.1 <2e-16 ***
## Residuals        6201 6658509375707   1073779935                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Use ANOVA to test whether there is a significant association between gender, num_drug_1984, marriage_col_2000 and income
summary(aov(income ~ gender + num_drug_1984 + marriage_col_2000, data = new.nlsy.2))
##                     Df        Sum Sq      Mean Sq F value   Pr(>F)    
## gender               1  315849828529 315849828529  276.45  < 2e-16 ***
## num_drug_1984        5   65880909821  13176181964   11.53 4.16e-11 ***
## marriage_col_2000    1  278647001869 278647001869  243.89  < 2e-16 ***
## Residuals         6199 7082416414452   1142509504                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Discussion: We can see from the three outputs that all p-values are far below 0.05 (most smaller than 2e-16), which means each term is significant at the 0.05 level: there is an association between income and gender, race, expect_education, num_drug_1984, and marriage_col_2000.

** Linear regression**

# regress outcome variable income against all other variables

lm.a <- lm(income ~ ., data = new.nlsy.2)
lm.a
## 
## Call:
## lm(formula = income ~ ., data = new.nlsy.2)
## 
## Coefficients:
##                 (Intercept)                    versionid  
##                  65090.9297                           NA  
##                      caseid           birth_countryOther  
##                     -0.2733                    4558.5034  
## expect_educationmiddle_high      expect_educationprimary  
##                 -17465.1710                  -31992.6576  
##                   raceblack                 racehispanic  
##                 -10550.2933                   -5644.5820  
##                genderfemale             num_drug_19841-9  
##                 -15990.4213                    1845.3201  
##          num_drug_198410-39           num_drug_198440-99  
##                   5904.4073                    2605.6223  
##        num_drug_1984100-999           num_drug_19841000+  
##                   1626.4507                   -3216.5358  
##      marriage_col_2000other              familysize_2012  
##                  -7909.7019                     398.0047  
##                jobsnum_2012  
##                   -496.7083
summary(lm.a)
## 
## Call:
## lm(formula = income ~ ., data = new.nlsy.2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70273 -22912  -4687  17110 142349 
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate  Std. Error t value    Pr(>|t|)
## (Intercept)                  65090.9297   1659.8045  39.216     < 2e-16
## versionid                            NA          NA      NA          NA
## caseid                          -0.2733      0.1457  -1.875      0.0608
## birth_countryOther            4558.5034   1840.5488   2.477      0.0133
## expect_educationmiddle_high -17465.1710    832.7235 -20.974     < 2e-16
## expect_educationprimary     -31992.6576   6484.5409  -4.934 0.000000828
## raceblack                   -10550.2933   1096.8558  -9.619     < 2e-16
## racehispanic                 -5644.5820   1305.3690  -4.324 0.000015554
## genderfemale                -15990.4213    841.5838 -19.000     < 2e-16
## num_drug_19841-9              1845.3201   1042.1714   1.771      0.0767
## num_drug_198410-39            5904.4073   1412.4467   4.180 0.000029518
## num_drug_198440-99            2605.6223   1643.6832   1.585      0.1130
## num_drug_1984100-999          1626.4507   1553.5156   1.047      0.2952
## num_drug_19841000+           -3216.5358   1748.9757  -1.839      0.0659
## marriage_col_2000other       -7909.7019    913.0887  -8.663     < 2e-16
## familysize_2012                398.0047    307.1770   1.296      0.1951
## jobsnum_2012                  -496.7083     60.0314  -8.274     < 2e-16
##                                
## (Intercept)                 ***
## versionid                      
## caseid                      .  
## birth_countryOther          *  
## expect_educationmiddle_high ***
## expect_educationprimary     ***
## raceblack                   ***
## racehispanic                ***
## genderfemale                ***
## num_drug_19841-9            .  
## num_drug_198410-39          ***
## num_drug_198440-99             
## num_drug_1984100-999           
## num_drug_19841000+          .  
## marriage_col_2000other      ***
## familysize_2012                
## jobsnum_2012                ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32210 on 6191 degrees of freedom
## Multiple R-squared:  0.1705, Adjusted R-squared:  0.1685 
## F-statistic: 84.83 on 15 and 6191 DF,  p-value: < 2.2e-16
# Tabulate the coefficient summary of the model fit above. The original
# referenced lm.1 (a model from an earlier section), but this section fits
# and discusses lm.a, so tabulate lm.a for consistency.
kable(summary(lm.a)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 83904.539 2634.491 31.848 0.0000
caseid -0.324 0.233 -1.394 0.1632
birth_countryOther 4245.462 2930.504 1.449 0.1475
expect_educationmiddle_high -25934.248 1327.800 -19.532 0.0000
expect_educationprimary -39654.483 10431.500 -3.801 0.0001
raceblack -17276.911 1751.408 -9.865 0.0000
racehispanic -10195.590 2080.578 -4.900 0.0000
genderfemale -27623.306 1335.661 -20.681 0.0000
num_drug_19841-9 2201.898 1659.169 1.327 0.1845
num_drug_198410-39 7427.039 2241.521 3.313 0.0009
num_drug_198440-99 2789.252 2608.926 1.069 0.2851
num_drug_1984100-999 629.600 2469.092 0.255 0.7987
num_drug_19841000+ -8180.144 2799.728 -2.922 0.0035
marriage_col_2000other -9017.580 1459.458 -6.179 0.0000
familysize_2012 1773.393 489.099 3.626 0.0003
jobsnum_2012 -698.401 96.015 -7.274 0.0000

Discussion: Now we run a regression model, regressing income on gender and all other variables. Most p-values are very small (notably for race and expected education), which means race may be a very significant factor influencing income differences between males and females. Note that many coefficients are negative (e.g. genderfemale, raceblack, the expected-education indicators), meaning lower average income relative to the baseline category — p-values themselves are always positive and say nothing about the direction of association. The adjusted R-squared is 0.1685, which means about 17% of the variance in income is explained by these factors.

# regress outcome variable income against gender, race, expect_education, marriage and familsize

lm.b <- lm(income ~ gender + race + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy.2)
lm.b
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 + 
##     familysize_2012, data = new.nlsy.2)
## 
## Coefficients:
##                 (Intercept)                 genderfemale  
##                     58657.3                     -15172.3  
##                   raceblack                 racehispanic  
##                    -11199.0                      -5712.6  
## expect_educationmiddle_high      expect_educationprimary  
##                    -17112.1                     -27968.6  
##      marriage_col_2000other              familysize_2012  
##                     -8801.5                        557.1
summary(lm.b)
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 + 
##     familysize_2012, data = new.nlsy.2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -63114 -22889  -5175  16785 140401 
## 
## Coefficients:
##                             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept)                  58657.3     1225.3  47.870     < 2e-16 ***
## genderfemale                -15172.3      828.5 -18.313     < 2e-16 ***
## raceblack                   -11199.0      982.1 -11.403     < 2e-16 ***
## racehispanic                 -5712.6     1133.4  -5.040 0.000000478 ***
## expect_educationmiddle_high -17112.1      834.3 -20.511     < 2e-16 ***
## expect_educationprimary     -27968.6     6435.3  -4.346 0.000014078 ***
## marriage_col_2000other       -8801.5      912.7  -9.643     < 2e-16 ***
## familysize_2012                557.1      308.7   1.805      0.0711 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32470 on 6199 degrees of freedom
## Multiple R-squared:  0.156,  Adjusted R-squared:  0.1551 
## F-statistic: 163.7 on 7 and 6199 DF,  p-value: < 2.2e-16
kable(summary(lm.b)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 58657.316 1225.345 47.870 0.0000
genderfemale -15172.331 828.502 -18.313 0.0000
raceblack -11198.980 982.091 -11.403 0.0000
racehispanic -5712.615 1133.412 -5.040 0.0000
expect_educationmiddle_high -17112.123 834.304 -20.511 0.0000
expect_educationprimary -27968.635 6435.299 -4.346 0.0000
marriage_col_2000other -8801.471 912.699 -9.643 0.0000
familysize_2012 557.128 308.684 1.805 0.0711

Discussion: Now we run a regression model, regressing income on gender, race, expected education, marriage status, and family size. Most p-values are very small (notably for race and expected education), which means race may be a very significant factor influencing income differences between males and females. Note that most coefficients are negative relative to the baseline categories — the sign of the coefficient, not the p-value, indicates the direction of association. The adjusted R-squared is 0.1551, which means about 15% of the variance in income is explained by these factors.

# Plot the linear regression model
plot(lm.b)

# Collinearity and pairs plots
income.var.names.b <- c("gender", "race", "expect_education", "marriage_col_2000", "familysize_2012")

pairs(new.nlsy.2[,income.var.names.b])

Discussion: Now we run a regression model, regressing income on gender, race, expected education, marriage status, and family size. From the residuals-vs-fitted graph, we can see that although there are some outliers, the residuals generally have a fan-shaped (non-constant) variance. From the QQ plot, I use a linear model for prediction even though the underlying normality assumptions do not fully hold; in this case the model is not the best fit but seems reasonable. The scale-location plot is another version of the residuals-vs-fitted plot. For residuals vs. leverage, we can see that points with high residuals and high leverage (outliers) skew the model fit away from the rest of the data.

From the colinearity table, we can see that in this model, the colinear relationship between each variable is very weak.

# regress outcome variable income against gender, race, expect_education, drug and familsize

lm.c <- lm(income ~ gender + race + expect_education + num_drug_1984 + familysize_2012, data = new.nlsy.2)
lm.c
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 + 
##     familysize_2012, data = new.nlsy.2)
## 
## Coefficients:
##                 (Intercept)                 genderfemale  
##                     54127.6                     -15973.3  
##                   raceblack                 racehispanic  
##                    -13795.0                      -6964.7  
## expect_educationmiddle_high      expect_educationprimary  
##                    -17600.6                     -28190.3  
##            num_drug_19841-9           num_drug_198410-39  
##                       686.4                       4536.8  
##          num_drug_198440-99         num_drug_1984100-999  
##                       639.6                      -1162.4  
##          num_drug_19841000+              familysize_2012  
##                     -7288.1                       1407.7
summary(lm.c)
## 
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 + 
##     familysize_2012, data = new.nlsy.2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67110 -24009  -4777  17121 141538 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  54127.6     1327.8  40.764  < 2e-16 ***
## genderfemale                -15973.3      849.4 -18.805  < 2e-16 ***
## raceblack                   -13795.0      951.7 -14.495  < 2e-16 ***
## racehispanic                 -6964.7     1135.5  -6.133 9.13e-10 ***
## expect_educationmiddle_high -17600.6      836.8 -21.033  < 2e-16 ***
## expect_educationprimary     -28190.3     6479.9  -4.350 1.38e-05 ***
## num_drug_19841-9               686.4     1048.3   0.655  0.51266    
## num_drug_198410-39            4536.8     1421.4   3.192  0.00142 ** 
## num_drug_198440-99             639.6     1652.6   0.387  0.69873    
## num_drug_1984100-999         -1162.4     1554.5  -0.748  0.45461    
## num_drug_19841000+           -7288.1     1735.8  -4.199 2.72e-05 ***
## familysize_2012               1407.7      296.5   4.747 2.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32630 on 6195 degrees of freedom
## Multiple R-squared:  0.1483, Adjusted R-squared:  0.1468 
## F-statistic: 98.06 on 11 and 6195 DF,  p-value: < 2.2e-16
kable(summary(lm.c)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 54127.602 1327.837 40.764 0.0000
genderfemale -15973.331 849.411 -18.805 0.0000
raceblack -13794.998 951.708 -14.495 0.0000
racehispanic -6964.739 1135.533 -6.133 0.0000
expect_educationmiddle_high -17600.633 836.821 -21.033 0.0000
expect_educationprimary -28190.254 6479.872 -4.350 0.0000
num_drug_19841-9 686.359 1048.294 0.655 0.5127
num_drug_198410-39 4536.848 1421.415 3.192 0.0014
num_drug_198440-99 639.641 1652.618 0.387 0.6987
num_drug_1984100-999 -1162.450 1554.484 -0.748 0.4546
num_drug_19841000+ -7288.087 1735.791 -4.199 0.0000
familysize_2012 1407.660 296.538 4.747 0.0000

Discussion: Now we run a regression model, regressing income on gender, race, expected education, num_drug_1984, and family size. Most p-values are very small (notably for race and expected education), which means race may be a very significant factor influencing income differences between males and females. But the p-values for most levels of drug use are high, which suggests drug use may not be a significant factor. As before, the coefficient signs (mostly negative relative to baseline) indicate the direction of association, not the p-values. The adjusted R-squared is 0.1468, which means about 15% of the variance in income is explained by these factors.

# regress outcome variable income against gender, race, expect_education, jobsnum, marriage and familysize

lm.d <- lm(income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy.2)
lm.d
## 
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education + 
##     marriage_col_2000 + familysize_2012, data = new.nlsy.2)
## 
## Coefficients:
##                 (Intercept)                 genderfemale  
##                     65702.4                     -15813.7  
##                   raceblack                 racehispanic  
##                    -11722.2                      -5979.2  
##                jobsnum_2012  expect_educationmiddle_high  
##                      -497.4                     -17715.8  
##     expect_educationprimary       marriage_col_2000other  
##                    -30758.6                      -8043.8  
##             familysize_2012  
##                       394.5
summary(lm.d)
## 
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education + 
##     marriage_col_2000 + familysize_2012, data = new.nlsy.2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67366 -22943  -4603  16960 142299 
## 
## Coefficients:
##                             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept)                  65702.4     1475.9  44.518     < 2e-16 ***
## genderfemale                -15813.7      827.3 -19.115     < 2e-16 ***
## raceblack                   -11722.2      978.5 -11.980     < 2e-16 ***
## racehispanic                 -5979.2     1127.5  -5.303 0.000000118 ***
## jobsnum_2012                  -497.4       58.8  -8.459     < 2e-16 ***
## expect_educationmiddle_high -17715.8      832.7 -21.276     < 2e-16 ***
## expect_educationprimary     -30758.6     6407.5  -4.800 0.000001620 ***
## marriage_col_2000other       -8043.8      912.0  -8.820     < 2e-16 ***
## familysize_2012                394.5      307.5   1.283         0.2    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32280 on 6198 degrees of freedom
## Multiple R-squared:  0.1657, Adjusted R-squared:  0.1646 
## F-statistic: 153.8 on 8 and 6198 DF,  p-value: < 2.2e-16
kable(summary(lm.d)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 65702.419 1475.878 44.518 0.0000
genderfemale -15813.737 827.309 -19.115 0.0000
raceblack -11722.201 978.506 -11.980 0.0000
racehispanic -5979.182 1127.457 -5.303 0.0000
jobsnum_2012 -497.428 58.804 -8.459 0.0000
expect_educationmiddle_high -17715.781 832.660 -21.276 0.0000
expect_educationprimary -30758.615 6407.481 -4.800 0.0000
marriage_col_2000other -8043.809 911.958 -8.820 0.0000
familysize_2012 394.504 307.543 1.283 0.1996

Discussion: Now we run a regression model, regressing income on gender, race, expected education, jobsnum, marriage status, and family size. Most p-values are very small (notably for race and expected education), which means race may be a very significant factor influencing income differences between males and females. Family size, however, has a high p-value (0.2) and is not significant in this model. The adjusted R-squared is 0.1646, which means about 16% of the variance in income is explained by these factors.

Interpreting the coefficient of genderfemale: on average, holding the other variables constant, females earn $15813 less than males.

# Calculate race-specific intercepts
# The model intercept corresponds to the baseline race category; the other two
# intercepts shift it by the raceblack / racehispanic coefficients. All three
# lines share the same slope on jobsnum_2012 (no interaction in lm.d).
# NOTE(review): assumes levels(new.nlsy.2$race) is ordered (baseline, black,
# hispanic) so the rows of lines.df.d pair with the right race — TODO confirm.
intercepts <- c(coef(lm.d)["(Intercept)"],
                coef(lm.d)["(Intercept)"] + coef(lm.d)["raceblack"],
                coef(lm.d)["(Intercept)"] + coef(lm.d)["racehispanic"])

# One fitted line per race: common slope, race-specific intercept
lines.df.d <- data.frame(intercepts = intercepts,
                       slopes = rep(coef(lm.d)["jobsnum_2012"], 3),
                       race = levels(new.nlsy.2$race))

# Scatter income vs. number of jobs, overlaying the race-specific fitted lines
qplot(x = jobsnum_2012, y = income, color = race, data = new.nlsy.2) + 
  geom_abline(aes(intercept = intercepts, 
                  slope = slopes, 
                  color = race), data = lines.df.d)

Interpreting the coefficients for race: the baseline race category is "other". Among people with the same number of jobs in 2012, Hispanic respondents earn on average more than Black respondents (coefficient -5979 vs. -11722 relative to the baseline).

# Plot the linear regression model
plot(lm.d)

# Collinearity and pairs plots
income.var.names.d <- c("gender", "race", "jobsnum_2012", "expect_education", "familysize_2012","marriage_col_2000")

pairs(new.nlsy.2[,income.var.names.d])

Discussion: Now we run a regression model, regressing income on gender, race, expected education, number of jobs, marriage status, and family size. From the residuals-vs-fitted graph, we can see that although there are some outliers, the residuals generally have a fan-shaped (non-constant) variance. From the QQ plot, I use a linear model for prediction even though the underlying normality assumptions do not fully hold; in this case the model is not the best fit but seems reasonable. The scale-location plot is another version of the residuals-vs-fitted plot. For residuals vs. leverage, we can see that points with high residuals and high leverage (outliers) skew the model fit away from the rest of the data.

From the colinearity table, this time although the colinear relationship between different factor varibales is still weak, it is better than former model. We can see there are some colinearity between jobsnum_2012, expected education and familysize.

Do some interactions in regression

#Whether race, marriage,familysize is a significant predictor of income differences between male and female

anova(update(lm.d, . ~ . - race), lm.d)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
##   Res.Df           RSS Df    Sum of Sq      F    Pr(>F)    
## 1   6200 6611478109255                                     
## 2   6198 6459998462703  2 151479646553 72.668 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(update(lm.d, . ~ . - marriage_col_2000), lm.d)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
##   Res.Df           RSS Df   Sum of Sq      F    Pr(>F)    
## 1   6199 6541086061501                                    
## 2   6198 6459998462703  1 81087598798 77.799 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(update(lm.d, . ~ . - familysize_2012), lm.d)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
##   Res.Df           RSS Df  Sum of Sq      F Pr(>F)
## 1   6199 6461713486522                            
## 2   6198 6459998462703  1 1715023819 1.6455 0.1996

Discussion: The first two ANOVA comparisons (dropping race and dropping marriage_col_2000) give p-values smaller than 0.05, so these variables are statistically significant predictors. However, dropping familysize_2012 gives p = 0.1996, so family size is not a statistically significant predictor in this model.

#interact gender with race

lm.d.interact <- update(lm.d, . ~ . + gender*race)

summary(lm.d.interact)
## 
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education + 
##     marriage_col_2000 + familysize_2012 + gender:race, data = new.nlsy.2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69751 -22376  -4625  16737 144049 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  68865.25    1530.20  45.004  < 2e-16 ***
## genderfemale                -21704.88    1155.87 -18.778  < 2e-16 ***
## raceblack                   -18980.35    1381.66 -13.737  < 2e-16 ***
## racehispanic                -10545.79    1627.03  -6.482 9.77e-11 ***
## jobsnum_2012                  -468.79      58.68  -7.990 1.60e-15 ***
## expect_educationmiddle_high -17687.62     829.01 -21.336  < 2e-16 ***
## expect_educationprimary     -31196.14    6382.10  -4.888 1.04e-06 ***
## marriage_col_2000other       -8369.27     909.02  -9.207  < 2e-16 ***
## familysize_2012                286.52     306.58   0.935     0.35    
## genderfemale:raceblack       13868.66    1872.88   7.405 1.49e-13 ***
## genderfemale:racehispanic     8714.34    2217.69   3.929 8.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32140 on 6196 degrees of freedom
## Multiple R-squared:  0.1733, Adjusted R-squared:  0.172 
## F-statistic: 129.9 on 10 and 6196 DF,  p-value: < 2.2e-16
kable(coef(summary(lm.d)), digits = c(0, 0, 2, 4))
Estimate Std. Error t value Pr(>|t|)
(Intercept) 65702 1476 44.52 0.0000
genderfemale -15814 827 -19.11 0.0000
raceblack -11722 979 -11.98 0.0000
racehispanic -5979 1127 -5.30 0.0000
jobsnum_2012 -497 59 -8.46 0.0000
expect_educationmiddle_high -17716 833 -21.28 0.0000
expect_educationprimary -30759 6407 -4.80 0.0000
marriage_col_2000other -8044 912 -8.82 0.0000
familysize_2012 395 308 1.28 0.1996
kable(coef(summary(lm.d.interact)), digits = c(0, 0, 2, 4))
Estimate Std. Error t value Pr(>|t|)
(Intercept) 68865 1530 45.00 0.0000
genderfemale -21705 1156 -18.78 0.0000
raceblack -18980 1382 -13.74 0.0000
racehispanic -10546 1627 -6.48 0.0000
jobsnum_2012 -469 59 -7.99 0.0000
expect_educationmiddle_high -17688 829 -21.34 0.0000
expect_educationprimary -31196 6382 -4.89 0.0000
marriage_col_2000other -8369 909 -9.21 0.0000
familysize_2012 287 307 0.93 0.3500
genderfemale:raceblack 13869 1873 7.41 0.0000
genderfemale:racehispanic 8714 2218 3.93 0.0001
Discussion: According to the kable tables, when we interact gender and race, the genderfemale:raceblack coefficient (13868.66) means that the male-female income gap among Black respondents is about $13868.66 smaller than the gap in the baseline ("other") race group. Race is a factor that is highly associated with the income gap between men and women; the main coefficients change substantially once the interaction is added, which further suggests that race is an important factor.
# Use ANOVA to further discuss whether there is statistically significance in income gaps between different races.

anova(lm.d, lm.d.interact)
## Analysis of Variance Table
## 
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + 
##     familysize_2012 + gender:race
##   Res.Df           RSS Df   Sum of Sq      F    Pr(>F)    
## 1   6198 6459998462703                                    
## 2   6196 6400667700183  2 59330762520 28.717 3.854e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new.nlsy.2 %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]))
## # A tibble: 3 x 2
##   expect_education income.gap
##   <fct>                 <dbl>
## 1 college              17756.
## 2 middle_high          12161.
## 3 primary              16898.

Discussion: This is the income gap between the different expected-education levels. Interestingly, the income gap within the primary-school group is higher than within the middle/high-school group, which means a higher expected level of education does not directly lead to a larger gender income gap.

# Bar chart of the male-female income gap by expected education
# (the error-bar version follows in the next chunk)

# Per education group: mean male income minus mean female income
gap <- new.nlsy.2 %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]))

# Order the bars from largest gap to smallest (note the minus sign)
gap<- mutate(gap,
                   expect_education = reorder(expect_education, -income.gap))

ggplot(data = gap, aes(x = expect_education, y = income.gap, fill = expect_education)) +
  geom_bar(stat = "identity") +
  xlab("expected education") + 
  ylab("Income gap by education") +
  ggtitle("Income gap between male and female, by expected education") + 
  guides(fill = FALSE)

# Calculate income gaps (male - female) and 95% confidence intervals
# per expected-education group, plus a significance flag from the t-test.
gap.conf <- new.nlsy.2 %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]),
            # t.test()$conf.int is ordered c(lower, upper), so element [1]
            # is the LOWER bound and [2] the UPPER bound. The original had
            # these swapped, which inverted the error bars in the plot below.
            lower = t.test(income ~ gender)$conf.int[1],
            upper = t.test(income ~ gender)$conf.int[2],
            is.significant = as.numeric(t.test(income ~ gender)$p.value < 0.05))

# reorder the expected education factor according to gap size
# (ascending this time: no minus sign, unlike the previous bar chart)
gap.conf <- mutate(gap.conf,
                        expect_education = reorder(expect_education, income.gap))

# error bar plots: bars show the gap, error bars the 95% CI of the t-test,
# fill encodes whether the gap is significant at the 0.05 level
ggplot(data = gap.conf, aes(x = expect_education, y = income.gap,
                            fill = is.significant)) +
  geom_bar(stat = "identity") +
  xlab("expected education") + 
  ylab("income gap by expected education") +
  ggtitle("Income gap between male and female by expected education") + 
  guides(fill = FALSE) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = 0.1, size = 1) +
  theme(text = element_text(size=12)) 

Part 4 Methodology

In this section you should provide an overview of the approach you took to exploring and analyzing the data. This is where you tell the story of how you got to your main findings. It’s too tedious to carefully format plots and tables for every approach you tried, so you can also use this section as a place to explain the various types of analyses that you tried.

  1. How did you deal with missing values? What impact does your approach have on the interpretation or generalizability of the resulting analysis?

In the raw data set, there are many negative values ranging from -5 to -1 (non-interview, valid skip, invalid skip, don't know, and refusal), which may influence the accuracy of our analysis. Therefore, I first subset the data into a new data frame containing 10+ variables. Then I transform all the negative values into NA and remove rows having NA, finally getting a smaller dataset with 5826 observations. Removing the missing values allows a more objective data analysis, though it may reduce generalizability if missingness is not random.

  1. How did you deal with topcoded variables? What impact does your approach have on the interpretation or generalizability of the resulting analysis?

First, I make some data summaries and regression analyses of the data with topcoded incomes, which show that extremely high incomes (outliers) affect the estimated relationships between gender, education, race, etc. and income. Therefore, I use a box plot and the max() function to find the topcoded value of income in 2012, which is 343830. Then I subset the data into a new data frame that does not contain observations with income of $343830 per year. After removing the topcoded values, the income distribution and the income differences across gender and race are more interpretable. Outliers distort the regression coefficients to some extent, but in this case the regression results do not change dramatically.

  1. Did you produce any tables or plots that you thought would reveal interesting trends but didn’t?

I expected that people who frequently use drugs would have lower income. However, the data show that US-born males with some drug experience actually have somewhat higher income.

  1. What relationships did you investigate that don’t appear in your findings section

I investigate the relationship between familysize and expect_education, the r squared is 0.0018.

  1. What’s the analysis that you finally settled on? What income and gender related factors do you investigate in the final analysis?

The final analysis that I settled on sets income as the outcome variable, with gender, race, expected education, family size in 2012, number of times drugs were used in 1984, and number of jobs held in 2012 as predictors. Expected education, race, and family size are important factors affecting income by gender, and generally males earn more than females.

Part 5 Discussion

  1. In this section you should summarize your main conclusions. You should also discuss potential limitations of your analysis and findings. Are there potential confounders that you didn’t control for? Are the models you fit believable?

Potential limitations including: (1) Most factors are categorical variables, only familysize and number of jobs are numeric, compared to the large numeric value of income, which may distort the objective of analysis. (2) The survey contains questions from 1979 to 2012, it is hard for the long-period continuous interviews to keep consistent, which may meet the situations like duplication and inconsistency. Therefore, I should have more considerations on selecting factor variables that are more likely to have causal impacts on the income in 2012. (Selecting questions that were interviewed in 2012 ) (3) The classification of education level, college, middle_high school, primary school needs more discussion. I recode the 18 levels of education into three levels and baseline variable is college. Maybe it is unnecessary to do that. (4) have tried different factor variables, but r squares are very similar, which means these factors are all important.

Potential confounders: Maybe the number of children one has, one’s parents’ job type.

The models I fit are believable.

  1. You should also address the following question: How much confidence do you have in your analysis? Do you believe your conclusions? Are you confident enough in your analysis and findings to present them to policy makers?

Probably I am 90% of confident in my analysis. I believe my conclusions. I am confident enough to present them to policy makers.